{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import random\n",
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "src_ratings = pd.read_csv(f'mindreader/ratings.csv')\n",
    "src_ratings = src_ratings[src_ratings.sentiment != 0]\n",
    "\n",
    "user_ratings = src_ratings[(src_ratings.sentiment == 1) & (src_ratings.isItem)][['uri', 'userId']].groupby('userId').count()\n",
    "user_ratings.columns = ['pos_ratings']\n",
    "\n",
    "src_ratings = src_ratings.merge(user_ratings, on='userId')\n",
    "src_ratings = src_ratings[src_ratings.pos_ratings > 2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "len(item_indices)=3013\n",
      "len(ratings)=40731\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "from generic_data_loader import Rating\n",
    "\n",
    "entity_idx = dict()\n",
    "entity_count = 0\n",
    "\n",
    "user_idx = dict()\n",
    "user_count = 0\n",
    "\n",
    "ratings = list()\n",
    "\n",
    "# Convert rows to ratings\n",
    "for index, row in src_ratings.iterrows():\n",
    "    if row.uri not in entity_idx:\n",
    "        entity_idx[row.uri] = entity_count\n",
    "        entity_count += 1\n",
    "        \n",
    "    if row.userId not in user_idx:\n",
    "        user_idx[row.userId] = user_count\n",
    "        user_count += 1\n",
    "    \n",
    "    rating = Rating(user_idx[row.userId], entity_idx[row.uri], row.sentiment, row.isItem)\n",
    "    ratings.append(rating)\n",
    "\n",
    "item_indices = set()\n",
    "for rating in ratings:\n",
    "    if rating.is_movie_rating:\n",
    "        item_indices.add(rating.e_idx)\n",
    "\n",
    "print(f'{len(item_indices)=}')\n",
    "print(f'{len(ratings)=}')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "done\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "de_training = list()\n",
    "movie_training = list()\n",
    "test = list()\n",
    "validation = list()\n",
    "\n",
    "def convert(rating_list):\n",
    "    return [{\"u_idx\": rating.u_idx,\n",
    "             \"e_idx\": rating.e_idx,\n",
    "             \"rating\": rating.rating,\n",
    "             \"is_movie_rating\": rating.is_movie_rating} for rating in rating_list]\n",
    "\n",
    "for user in user_idx.values():\n",
    "    user_ratings = [rating for rating in ratings if rating.u_idx == user]\n",
    "    \n",
    "    interacted_items = [rating.e_idx for rating in user_ratings if rating.is_movie_rating]\n",
    "    uninteracted_items = item_indices.difference(set(interacted_items))\n",
    "    \n",
    "    positive_interacted_items = [rating.e_idx for rating in user_ratings if rating.is_movie_rating and rating.rating == 1]\n",
    "    samples = random.sample(positive_interacted_items, 2)\n",
    "    val_sample, test_sample = samples\n",
    "    assert val_sample != test_sample\n",
    "    assert val_sample in interacted_items\n",
    "    assert test_sample in interacted_items\n",
    "    \n",
    "    train = [rating for rating in user_ratings if rating.e_idx not in samples]\n",
    "    assert len(train) + 2 == len(user_ratings)\n",
    "    assert val_sample not in [rating.e_idx for rating in train]\n",
    "    assert test_sample not in [rating.e_idx for rating in train]\n",
    "    \n",
    "    train_movies_only = [rating for rating in train if rating.is_movie_rating]\n",
    "    \n",
    "    # Add to lists\n",
    "    test.append((user, (test_sample, list(uninteracted_items))))\n",
    "    validation.append((user, (val_sample, list(uninteracted_items))))\n",
    "    de_training.append((user, convert(train)))\n",
    "    movie_training.append((user, convert(train_movies_only)))\n",
    "    \n",
    "with open('new_data/with/0.json', 'w') as fp:\n",
    "    json.dump({'training': de_training, 'testing': test, 'validation': validation}, fp)\n",
    "    \n",
    "with open('new_data/without/0.json', 'w') as fp:\n",
    "    json.dump({'training': movie_training, 'testing': test, 'validation': validation}, fp)\n",
    "\n",
    "print('done')"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "outputs": [],
   "source": [
    "# Dump meta\n",
    "with open('new_data/meta.json', 'w') as fp:\n",
    "    json.dump({'e_idx_map': entity_idx}, fp)\n"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}